#load the necessary packages:
import numpy as np
import pandas as pd
import cimcb as cb
from sklearn.model_selection import train_test_split
import seaborn as sns
np.random.seed(42)
print('All packages successfully loaded')
Using Theano backend.
All packages successfully loaded
# Load the study workbook via cimcb's Excel helper: the 'Data' sheet holds the
# sample x metabolite intensity matrix, the 'Peak' sheet the per-peak metadata
# (Name, Label, Perc_missing, ...). Returns one DataFrame per sheet.
file = 'MTBLS136.xlsx'
DataTable,PeakTable = cb.utils.load_dataXL(file, DataSheet='Data', PeakSheet='Peak')
Loadings PeakFile: Peak Loadings DataFile: Data Data Table & Peak Table is suitable. TOTAL SAMPLES: 1649 TOTAL PEAKS: 949 Done!
# Display the raw table (1649 samples x 953 columns; cell output below).
DataTable
| Idx | SampleID | Class | Hormone | M1 | M2 | M3 | M4 | M5 | M6 | ... | M940 | M941 | M942 | M943 | M944 | M945 | M946 | M947 | M948 | M949 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 063942_01 | 2.0 | Nonuser | NaN | NaN | NaN | 95695.0 | 311381856 | 4701657.0 | ... | NaN | 641183.0 | NaN | 29736404.0 | 146489552.0 | 25615702.0 | 125249.0 | 209356.0 | 1406015.0 | 8034717.0 |
| 2 | 2 | 037523_02 | 0.0 | E-only | NaN | NaN | NaN | 503202.0 | 98715432 | 753959.0 | ... | 2544193.0 | 548504.0 | NaN | NaN | NaN | 39473920.0 | 192490.0 | 156234.0 | 808012.0 | 1031587.0 |
| 3 | 3 | 073016_03 | 0.0 | E-only | NaN | NaN | NaN | 104038.0 | 145110144 | NaN | ... | 403755.0 | 768151.0 | NaN | NaN | NaN | 25694854.0 | 233899.0 | 129411.0 | 1166341.0 | 630163.0 |
| 4 | 4 | 061210_01 | 2.0 | Nonuser | NaN | NaN | NaN | 366910.0 | 156989408 | 549563.0 | ... | 561696.0 | 1008375.0 | NaN | NaN | NaN | 26806070.0 | 99558.0 | NaN | 2180112.0 | 1008026.0 |
| 5 | 5 | 063862_01 | 0.0 | E-only | NaN | NaN | NaN | 608979.0 | 140037936 | 1326962.0 | ... | 48161.0 | 750182.0 | NaN | NaN | NaN | 33121192.0 | NaN | 125668.0 | 552264.0 | 1529796.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1645 | 1645 | IU_4945_1001 | NaN | NaN | NaN | NaN | NaN | 115469.0 | 147413616 | 576361.0 | ... | 144390.0 | 338859.0 | NaN | NaN | NaN | 24838220.0 | 99708.0 | 64767.0 | 1244962.0 | 1651202.0 |
| 1646 | 1646 | IU_5881_1000 | NaN | NaN | NaN | NaN | NaN | NaN | 141172224 | NaN | ... | NaN | 511543.0 | NaN | NaN | NaN | 57732092.0 | 160562.0 | NaN | 1373740.0 | 475267.0 |
| 1647 | 1647 | IU_5881_1001 | NaN | NaN | NaN | NaN | NaN | NaN | 145479904 | 221693.0 | ... | NaN | 563088.0 | NaN | NaN | NaN | 68145600.0 | 172044.0 | 57869.0 | 1522911.0 | 460609.0 |
| 1648 | 1648 | IU_4945_1000 | NaN | NaN | NaN | NaN | NaN | 478657.0 | 156384400 | 731069.0 | ... | 166907.0 | 418370.0 | NaN | NaN | NaN | 26290880.0 | 116872.0 | NaN | 1116896.0 | 1980565.0 |
| 1649 | 1649 | IU_5193_1000 | NaN | NaN | NaN | NaN | NaN | NaN | 148171072 | 484084.0 | ... | 41187.0 | 392013.0 | NaN | NaN | NaN | 26832696.0 | 85120.0 | 71943.0 | 1281148.0 | 1059509.0 |
1649 rows × 953 columns
# Remove the redundant Idx column and promote SampleID to the row index,
# in one method chain.
DataTable = DataTable.drop(columns=["Idx"]).set_index("SampleID")
# Visualise missingness in the raw data: one heatmap cell per (sample, peak),
# second palette colour marking NaN entries; colour bar suppressed since the
# map is binary. (Removed the dead `cols`/`colors` locals: `colors` was never
# used and `DataTable[cols]` with all columns is an identity selection.)
sns.heatmap(DataTable.isnull(), cmap=sns.color_palette(['#1f77b4', '#ff0000']),
            cbar=False, linecolor='gray')
<AxesSubplot:ylabel='SampleID'>
# Total number of missing cells in the whole frame: the first sum() is
# per-column, the second collapses those totals (310432 in the raw data).
DataTable.isna().sum().sum()
310432
# Build corrupted copies of the data for the robustness experiments below.
# NOTE(review): the Boolean masks span DataTable.shape, so they also hit the
# non-numeric metadata columns (Class, Hormone) — "10% of the values" thus
# includes labels, not just peak intensities; confirm this is intended.
# Blank out ~10% of all cells (True with p=0.1 -> replaced by NaN).
mask_10 = np.random.choice([True,False],size=DataTable.shape,p=[0.1,0.9])
miss_10_DataTable = DataTable.mask(mask_10,other=np.nan)
miss_10_DataSet = miss_10_DataTable.copy()# used later on for removing irrelevant rows
# Blank out ~50% of all cells.
mask_50 = np.random.choice([True,False],size=DataTable.shape,p=[0.5,0.5])
miss_50_DataTable = DataTable.mask(mask_50,other=np.nan)
miss_50_DataSet = miss_50_DataTable.copy()# used later on for removing irrelevant rows
# Replace the SAME ~10% of cells (mask_10 is reused, so the changed positions
# coincide with the missing ones above) by uniform ints in [500000, 8000000).
change_10 = np.random.randint(low= 500000, high= 8000000,size= DataTable.shape)
change_10_DataTable = DataTable.mask(mask_10,other=change_10)
change_10_DataSet = change_10_DataTable.copy()# used later on for removing irrelevant data
# Replace the SAME ~50% of cells (mask_50 reused) by random ints.
change_50 = np.random.randint(low= 500000, high= 8000000,size= DataTable.shape)
change_50_DataTable = DataTable.mask(mask_50,other=change_50)
change_50_DataSet = change_50_DataTable.copy()# used later on for removing irrelevant data
# Re-plot missingness of the ORIGINAL DataTable for comparison with the
# corrupted copies. (Removed the dead `cols`/`colors` locals: `colors` was
# never used and `DataTable[cols]` with all columns is an identity selection.)
sns.heatmap(DataTable.isnull(), cmap=sns.color_palette(['#1f77b4', '#ff0000']),
            cbar=False, linecolor='gray')
<AxesSubplot:ylabel='SampleID'>
# Missingness heatmap for the 50%-changed copy. Note the mask REPLACED values
# (with random ints) rather than blanking them, so only the original NaNs that
# escaped the mask show up here. (Dead `cols`/`colors` locals removed; the
# full-column selection was an identity operation.)
sns.heatmap(change_50_DataSet.isnull(), cmap=sns.color_palette(['#1f77b4', '#ff0000']),
            cbar=False, linecolor='gray')
<AxesSubplot:ylabel='SampleID'>
# Median/mode imputation of the 10%-missing copy, then a heatmap to confirm
# no NaNs remain.
# Numeric columns: fill each column's NaNs with that column's median.
# Vectorised DataFrame.fillna replaces the former per-column
# `df[col].fillna(..., inplace=True)` loop — that pattern is chained
# assignment on a column slice, deprecated by pandas and a silent no-op
# under copy-on-write (pandas 3.x).
df_numeric = miss_10_DataTable._get_numeric_data().copy()
numeric_cols = df_numeric.columns.values
df_numeric = df_numeric.fillna(df_numeric.median())
# Non-numeric columns: fill NaNs with each column's mode. DataFrame.mode()
# row 0 holds the first (smallest) mode per column, matching the previous
# per-column `mode()[0]` loop.
df_non_numeric = miss_10_DataTable.select_dtypes(exclude=np.number).copy()
non_numeric_cols = df_non_numeric.columns.values
df_non_numeric = df_non_numeric.fillna(df_non_numeric.mode().iloc[0])
# Recombine the imputed halves; inner join keeps the shared sample index.
imputed_10_miss = pd.concat([df_non_numeric,df_numeric], axis=1, join='inner')
# Heatmap of remaining missingness (should now be one solid colour).
# Dead `cols`/`colors` locals removed (never used / identity selection).
sns.heatmap(imputed_10_miss.isnull(), cmap=sns.color_palette(['#1f77b4', '#ff0000']),
            cbar=False, linecolor='gray')
<AxesSubplot:ylabel='SampleID'>
# Sanity check: collect every column that still contains a NaN (expect []).
[col for col in imputed_10_miss.columns if imputed_10_miss[col].isna().any()]
[]
# ---- SVM pipeline on the median-imputed 10%-missing data ----
# Keep only peaks with <20% missingness in the ORIGINAL data.
PercMiss = PeakTable['Perc_missing']
PeakTableClean = PeakTable[(PercMiss < 20)]
PeakList = PeakTableClean['Name']
# Restrict to the two classes of interest (Class 0 vs 1) for binary modelling.
DataTable2 = imputed_10_miss[(imputed_10_miss.Class == 1) | (imputed_10_miss.Class == 0)]
# Binary outcome vector.
Outcomes = DataTable2['Class']
Y = Outcomes.values
# Stratified 2/3 train / 1/3 test split, fixed seed for reproducibility.
DataTrain, DataTest, YTrain, YTest = train_test_split(DataTable2, Y, test_size=1/3, stratify=Y, random_state=85)
# Train matrix: log-transform intensities then autoscale; mu/sigma are kept
# so the TEST set can be scaled with TRAIN statistics (no leakage).
XTrain = DataTrain[PeakList]
XTrainLog = np.log(XTrain)
XTrainScale, mu, sigma = cb.utils.scale(XTrainLog, method='auto', return_mu_sigma=True)
# kNN imputation is disabled (data already median-imputed); the *Knn names
# are kept only so the downstream cells keep working unchanged.
XTrainKnn = XTrainScale# cb.utils.knnimpute(XTrainScale, k=3)
# Test matrix: same transform, scaled with the training mu/sigma.
XTest = DataTest[PeakList]
XTestLog = np.log(XTest)
XTestScale = cb.utils.scale(XTestLog, method='auto', mu=mu, sigma=sigma)
XTestKnn = XTestScale #cb.utils.knnimpute(XTestScale, k=3)
# Hyperparameter grid: linear-kernel SVM, coarse sweep over regularisation C.
C_range = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
param_dict = dict(C=C_range, kernel="linear")
# 5-fold CV with 10 Monte Carlo repetitions, parallelised over 30 cores.
cv = cb.cross_val.KFold(model=cb.model.SVM,
                        X=XTrainKnn,
                        Y=YTrain,
                        param_dict=param_dict,
                        folds=5,
                        n_mc=10,
                        n_cores=30)
# Run the grid search and plot AUC and R2/Q2 against C.
cv.run()
cv.plot(metric='auc')
cv.plot(metric='r2q2')
Running ...
1/2: 100%|███████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 304.37it/s] 2/2: 100%|██████████████████████████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 71.17it/s]
Time taken: 1.29 minutes with 30 cores Done!
# Re-run of the identical coarse C sweep (same grid and CV settings as the
# previous cell) — duplicated notebook cell, retained for the session record.
C_range = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
param_dict = dict(C=C_range, kernel="linear")
cv = cb.cross_val.KFold(model=cb.model.SVM,
                        X=XTrainKnn,
                        Y=YTrain,
                        param_dict=param_dict,
                        folds=5,
                        n_mc=10,
                        n_cores=30)
cv.run()
cv.plot(metric='auc')
cv.plot(metric='r2q2')
Running ...
1/2: 100%|█████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 11550.01it/s] 2/2: 100%|██████████████████████████████████████████████████████████████████████████████| 70/70 [00:01<00:00, 48.36it/s]
Time taken: 0.08 minutes with 30 cores Done!
# Refined C sweep: zoom into [1e-4, 1e-3], the region favoured by the coarse
# grid (an earlier wider grid is kept commented for reference).
#C_range = [1e-6,1e-5,1e-4,5e-4,1e-3,5e-3,1e-2,1e-1]
C_range = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001]
param_dict = dict(C=C_range, kernel="linear")
# Same CV protocol as before: 5 folds x 10 Monte Carlo repeats, 30 cores.
cv = cb.cross_val.KFold(model=cb.model.SVM,
                        X=XTrainKnn,
                        Y=YTrain,
                        param_dict=param_dict,
                        folds=5,
                        n_mc=10,
                        n_cores=30)
cv.run()
cv.plot(metric='auc')
cv.plot(metric='r2q2')
Running ...
1/2: 100%|███████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 14634.70it/s] 2/2: 100%|████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 43.82it/s]
Time taken: 0.11 minutes with 30 cores Done!
# Final SVM with C chosen from the refined CV sweep.
model = cb.model.SVM(C=0.0005, kernel="linear")
YPredTrain = model.train(XTrainKnn, YTrain)  # fit on train split; assigned values used below as train predictions
YPredTest = model.test(XTestKnn)
# Pair true labels with predictions for cimcb's evaluate() helpers.
EvalTrain = [YTrain, YPredTrain]
EvalTest = [YTest, YPredTest]
model.evaluate(testset=EvalTest)
# Bootstrap evaluation: refit on ALL class-0/1 samples, autoscaled on the
# full data. NOTE(review): scale() here derives mu/sigma from the FULL set
# (not the train statistics), and this train() overwrites the train-split
# fit that produced EvalTrain above — confirm this matches cimcb's intended
# bootstrap workflow.
XBoot = DataTable2[PeakList]
XBootLog = np.log(XBoot)
XBootScale = cb.utils.scale(XBootLog, method='auto')
#XBootKnn = XBootScale #cb.utils.knnimpute(XBootScale, k=3)
YPredBoot = model.train(XBootScale, Y)
# 100 bootstrap resamples (cimcb percentile method).
bootmodel = cb.bootstrap.Per(model, bootnum=100)
bootmodel.run()
bootmodel.evaluate(trainset=EvalTrain, testset=EvalTest)
# Rank peaks by |linear-SVM coefficient| and show the 5 largest.
coef = model.model.coef_ # extract coefficient
important_feat = abs(coef[0])
idx = important_feat.argsort(kind= "quicksort")  # ascending order of |coef|
idx= idx[::-1][:5]  # reverse and keep the top 5
# Map back to peak metadata. NOTE(review): iloc[top_met.index-1] assumes
# PeakList's index equals PeakTable's position + 1 (i.e. Idx starts at 1 and
# rows are unshuffled) — verify this alignment.
top_met = PeakList.iloc[idx]
top_met_info = PeakTable.iloc[top_met.index-1]
top_met_info
| Idx | Name | Label | Perc_missing | |
|---|---|---|---|---|
| 41 | 41 | M41 | 1-linoleoyl-GPA (18:2)* | 0.303214 |
| 618 | 618 | M618 | lysine | 0.000000 |
| 123 | 123 | M123 | 2-aminoheptanoate | 0.000000 |
| 476 | 476 | M476 | gamma-glutamyl-alpha-lysine | 0.060643 |
| 576 | 576 | M576 | Isobar: fructose 6-phosphate, fructose 1-phosp... | 2.546998 |
# Random-forest hyperparameter sweep: grid over tree depth (1..10) and the
# minimum leaf size expressed as a fraction of samples. The remaining dict
# entries are scalars — presumably held fixed by cimcb rather than swept;
# confirm against cimcb's KFold docs.
depth = list(range(1,11))
leaf_asfraction = [0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5]
param_dict = dict(max_depth=depth,
                  min_samples_leaf=leaf_asfraction,
                  max_features='sqrt',
                  criterion='gini',
                  min_samples_split=2,
                  max_leaf_nodes=None,
                  n_estimators=100)
# 5-fold CV, 10 Monte Carlo repeats, 30 worker processes.
cv = cb.cross_val.KFold(model=cb.model.RF,
                        X=XTrainKnn,
                        Y=YTrain,
                        param_dict=param_dict,
                        folds=5,
                        n_mc=10,
                        n_cores=30)
# Run and plot AUC / R2-Q2 (color_beta is a cimcb colour-scaling knob).
cv.run()
cv.plot(metric='auc', color_beta=[5,5,3])
cv.plot(metric='r2q2', color_beta=[5,5,3])
Running ...
1/2: 0%| | 0/110 [00:00<?, ?it/s]Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. 1/2: 100%|████████████████████████████████████████████████████████████████████████████| 110/110 [01:05<00:00, 1.67it/s] 2/2: 100%|██████████████████████████████████████████████████████████████████████████| 1100/1100 [00:52<00:00, 20.86it/s]
Time taken: 2.07 minutes with 30 cores Done!
/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/utils/color_scale.py:9: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. x_init = scaler.fit_transform(x[:, np.newaxis]).flatten() /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/utils/color_scale.py:9: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. x_init = scaler.fit_transform(x[:, np.newaxis]).flatten() /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/utils/color_scale.py:9: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. x_init = scaler.fit_transform(x[:, np.newaxis]).flatten()
/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/utils/color_scale.py:9: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. x_init = scaler.fit_transform(x[:, np.newaxis]).flatten() /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/utils/color_scale.py:9: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. x_init = scaler.fit_transform(x[:, np.newaxis]).flatten() /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/utils/color_scale.py:9: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. x_init = scaler.fit_transform(x[:, np.newaxis]).flatten()
# Final RF with the depth / leaf-fraction chosen from the CV sweep.
model = cb.model.RF(max_depth=7,
                    min_samples_leaf=0.05,
                    max_features='sqrt',
                    criterion='gini',
                    min_samples_split=2,
                    max_leaf_nodes=None,
                    n_estimators=100)
YPredTrain = model.train(XTrainKnn, YTrain)
YPredTest = model.test(XTestKnn)
# Pair true labels with predictions for cimcb's evaluate() helpers.
EvalTrain = [YTrain, YPredTrain]
EvalTest = [YTest, YPredTest]
model.evaluate(testset=EvalTest)
# Bootstrap evaluation: refit on all class-0/1 samples, autoscaled on the
# full data. NOTE(review): this train() overwrites the train-split fit that
# produced EvalTrain above — confirm against cimcb's intended workflow.
XBoot = DataTable2[PeakList]
XBootLog = np.log(XBoot)
XBootScale = cb.utils.scale(XBootLog, method='auto')
#XBootKnn = XBootScale #cb.utils.knnimpute(XBootScale, k=3)
YPredBoot = model.train(XBootScale, Y)
# 100 bootstrap resamples (percentile method).
bootmodel = cb.bootstrap.Per(model, bootnum=100)
bootmodel.run()
bootmodel.evaluate(trainset=EvalTrain, testset=EvalTest)
# Rank peaks by impurity-based importance and show the top 5.
# (abs() should be a no-op here: sklearn impurity importances are
# non-negative — kept for symmetry with the SVM cell.)
important_feat = model.model.feature_importances_
important_feat = abs(important_feat)
idx = important_feat.argsort(kind= "quicksort")  # ascending
idx= idx[::-1][:5]  # top 5
# NOTE(review): iloc[top_met.index-1] assumes PeakList's index equals
# PeakTable's position + 1 (Idx starts at 1, rows unshuffled) — verify.
top_met = PeakList.iloc[idx]
top_met_info = PeakTable.iloc[top_met.index-1]
top_met_info
| Idx | Name | Label | Perc_missing | |
|---|---|---|---|---|
| 41 | 41 | M41 | 1-linoleoyl-GPA (18:2)* | 0.303214 |
| 742 | 742 | M742 | oleoyl-arachidonoyl-glycerol (18:1/20:4) [2]* | 0.363857 |
| 476 | 476 | M476 | gamma-glutamyl-alpha-lysine | 0.060643 |
| 123 | 123 | M123 | 2-aminoheptanoate | 0.000000 |
| 143 | 143 | M143 | 2-linoleoylglycerol (18:2) | 0.545785 |
# Median-impute the numeric columns of the 50%-missing copy.
# Vectorised DataFrame.fillna replaces the former per-column
# `df[col].fillna(..., inplace=True)` loop — chained assignment on a column
# slice, deprecated by pandas and a no-op under copy-on-write (pandas 3.x).
# Columns that became entirely NaN have no median (their median is NaN) and
# therefore remain missing here — they are dropped in the next cell. (The
# notebook's "Mean of empty slice" RuntimeWarnings come from those columns.)
df_numeric = miss_50_DataTable._get_numeric_data().copy()
numeric_cols = df_numeric.columns.values
df_numeric = df_numeric.fillna(df_numeric.median())
/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice return np.nanmean(a, axis, out=out, keepdims=keepdims) /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice return np.nanmean(a, axis, out=out, keepdims=keepdims) /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice return np.nanmean(a, axis, out=out, keepdims=keepdims) /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice return np.nanmean(a, axis, out=out, keepdims=keepdims) /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice return np.nanmean(a, axis, out=out, keepdims=keepdims) /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice return np.nanmean(a, axis, out=out, keepdims=keepdims) /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice return np.nanmean(a, axis, out=out, keepdims=keepdims) /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/numpy/lib/nanfunctions.py:1117: RuntimeWarning: Mean of empty slice return np.nanmean(a, axis, out=out, keepdims=keepdims)
# Drop columns that are still all-NaN (no median existed), mode-impute the
# non-numeric columns, recombine, and visualise the result.
only_missings= df_numeric.columns[df_numeric.isna().any()].tolist()# columns that stayed NaN (too many NaNs)
df_numeric= df_numeric.drop(only_missings, axis=1)
# Non-numeric columns: fill with the per-column mode. Vectorised fillna with
# DataFrame.mode() row 0 (first mode per column) replaces the deprecated
# per-column `fillna(inplace=True)` loop (chained assignment).
df_non_numeric = miss_50_DataTable.select_dtypes(exclude=np.number).copy()
non_numeric_cols = df_non_numeric.columns.values
df_non_numeric = df_non_numeric.fillna(df_non_numeric.mode().iloc[0])
# Concatenate the imputed halves on the shared sample index.
imputed_50_miss = pd.concat([df_non_numeric,df_numeric], axis=1, join='inner')
# Heatmap: should show a single colour if no NaNs remain. Dead `cols`/`colors`
# locals removed (never used / identity column selection).
import matplotlib.pyplot as plt
m_heatmap= sns.heatmap(imputed_50_miss.isnull(), cmap=sns.color_palette(['#1f77b4', '#ff0000']),
                       cbar=False, linecolor='gray')
m_heatmap.set_xlabel('Features')
m_heatmap.set_title('Heatmap after median imputation for 50% removed data')
m_heatmap.legend(title='missing: red, present: blue', labels=['Present', 'Missing'], loc='upper right')
<matplotlib.legend.Legend at 0x7f59e1aad8d0>
# Confirm no columns with NaNs remain after imputation (expect []).
imputed_50_miss.columns[imputed_50_miss.isna().any()].tolist()
[]
# ---- SVM pipeline on the median-imputed 50%-missing data ----
# Same protocol as the 10% run. Keep peaks with <20% ORIGINAL missingness.
# NOTE(review): PeakList is filtered on the original Perc_missing while
# imputed_50_miss had its all-NaN columns dropped — DataTrain[PeakList]
# relies on none of those dropped columns surviving the <20% filter.
PercMiss = PeakTable['Perc_missing']
PeakTableClean = PeakTable[(PercMiss < 20)]
PeakList = PeakTableClean['Name']
# Restrict to the binary classes of interest.
DataTable2 = imputed_50_miss[(imputed_50_miss.Class == 1) | (imputed_50_miss.Class == 0)]
Outcomes = DataTable2['Class']
Y = Outcomes.values
# Stratified 2/3 train / 1/3 test split, same seed as earlier cells.
DataTrain, DataTest, YTrain, YTest = train_test_split(DataTable2, Y, test_size=1/3, stratify=Y, random_state=85)
# Log-transform and autoscale train; test reuses the train mu/sigma.
XTrain = DataTrain[PeakList]
XTrainLog = np.log(XTrain)
XTrainScale, mu, sigma = cb.utils.scale(XTrainLog, method='auto', return_mu_sigma=True)
# kNN imputation disabled (already median-imputed); names kept for reuse.
XTrainKnn = XTrainScale# cb.utils.knnimpute(XTrainScale, k=3)
XTest = DataTest[PeakList]
XTestLog = np.log(XTest)
XTestScale = cb.utils.scale(XTestLog, method='auto', mu=mu, sigma=sigma)
XTestKnn = XTestScale #cb.utils.knnimpute(XTestScale, k=3)
# Coarse linear-SVM C sweep, 5-fold CV x 10 Monte Carlo repeats.
C_range = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
param_dict = dict(C=C_range, kernel="linear")
cv = cb.cross_val.KFold(model=cb.model.SVM,
                        X=XTrainKnn,
                        Y=YTrain,
                        param_dict=param_dict,
                        folds=5,
                        n_mc=10,
                        n_cores=30)
cv.run()
cv.plot(metric='auc')
cv.plot(metric='r2q2')
Running ...
1/2: 0%| | 0/7 [00:00<?, ?it/s]Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. 1/2: 100%|████████████████████████████████████████████████████████████████████████████████| 7/7 [00:42<00:00, 6.03s/it] 2/2: 100%|█████████████████████████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 190.28it/s]
Time taken: 1.64 minutes with 30 cores Done!
# Re-run of the identical coarse C sweep (duplicated notebook cell, kept for
# the session record).
C_range = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
param_dict = dict(C=C_range, kernel="linear")
cv = cb.cross_val.KFold(model=cb.model.SVM,
                        X=XTrainKnn,
                        Y=YTrain,
                        param_dict=param_dict,
                        folds=5,
                        n_mc=10,
                        n_cores=30)
cv.run()
cv.plot(metric='auc')
cv.plot(metric='r2q2')
Running ...
1/2: 100%|██████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 8340.95it/s] 2/2: 100%|█████████████████████████████████████████████████████████████████████████████| 70/70 [00:00<00:00, 159.92it/s]
Time taken: 0.03 minutes with 30 cores Done!
# Refined C sweep on [1e-4, 1e-3] (earlier wider grid kept commented).
#C_range = [1e-6,1e-5,1e-4,5e-4,1e-3,5e-3,1e-2,1e-1]
C_range = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001]
param_dict = dict(C=C_range, kernel="linear")
# Same CV protocol: 5 folds x 10 Monte Carlo repeats, 30 cores.
cv = cb.cross_val.KFold(model=cb.model.SVM,
                        X=XTrainKnn,
                        Y=YTrain,
                        param_dict=param_dict,
                        folds=5,
                        n_mc=10,
                        n_cores=30)
cv.run()
cv.plot(metric='auc')
cv.plot(metric='r2q2')
Running ...
1/2: 100%|███████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 12572.85it/s] 2/2: 100%|███████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 113.39it/s]
Time taken: 0.03 minutes with 30 cores Done!
# Final SVM for the 50%-missing data, C chosen from the refined sweep.
model = cb.model.SVM(C=0.0005, kernel="linear")
YPredTrain = model.train(XTrainKnn, YTrain)
YPredTest = model.test(XTestKnn)
# Pair true labels with predictions for cimcb's evaluate() helpers.
EvalTrain = [YTrain, YPredTrain]
EvalTest = [YTest, YPredTest]
model.evaluate(testset=EvalTest)
# Bootstrap evaluation on all class-0/1 samples, autoscaled on the full set.
# NOTE(review): mu/sigma here come from the FULL data (not train), and this
# train() overwrites the split fit used for EvalTrain above — confirm this
# matches cimcb's intended bootstrap workflow.
XBoot = DataTable2[PeakList]
XBootLog = np.log(XBoot)
XBootScale = cb.utils.scale(XBootLog, method='auto')
#XBootKnn = XBootScale #cb.utils.knnimpute(XBootScale, k=3)
YPredBoot = model.train(XBootScale, Y)
# 100 bootstrap resamples (percentile method).
bootmodel = cb.bootstrap.Per(model, bootnum=100)
bootmodel.run()
bootmodel.evaluate(trainset=EvalTrain, testset=EvalTest)
# Rank peaks by |linear-SVM coefficient|, top 5.
coef = model.model.coef_ # extract coefficient
important_feat = abs(coef[0])
idx = important_feat.argsort(kind= "quicksort")  # ascending
idx= idx[::-1][:5]  # top 5
# NOTE(review): iloc[top_met.index-1] assumes PeakList's index equals
# PeakTable's position + 1 (Idx starts at 1, rows unshuffled) — verify.
top_met = PeakList.iloc[idx]
top_met_info = PeakTable.iloc[top_met.index-1]
top_met_info
| Idx | Name | Label | Perc_missing | |
|---|---|---|---|---|
| 907 | 907 | M907 | theophylline | 14.918132 |
| 910 | 910 | M910 | threonine | 0.000000 |
| 305 | 305 | M305 | androstenediol (3beta,17beta) disulfate (2) | 0.000000 |
| 39 | 39 | M39 | 1-linoleoyl-2-arachidonoyl-GPC (18:2/20:4n6)* | 1.030928 |
| 122 | 122 | M122 | 2-aminobutyrate | 0.000000 |
# RF hyperparameter sweep for the 50%-missing data: grid over depth and
# min leaf fraction; the scalar dict entries are presumably held fixed by
# cimcb rather than swept — confirm.
depth = list(range(1,11))
leaf_asfraction = [0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5]
param_dict = dict(max_depth=depth,
                  min_samples_leaf=leaf_asfraction,
                  max_features='sqrt',
                  criterion='gini',
                  min_samples_split=2,
                  max_leaf_nodes=None,
                  n_estimators=100)
# 5-fold CV, 10 Monte Carlo repeats, 30 worker processes.
cv = cb.cross_val.KFold(model=cb.model.RF,
                        X=XTrainKnn,
                        Y=YTrain,
                        param_dict=param_dict,
                        folds=5,
                        n_mc=10,
                        n_cores=30)
cv.run()
cv.plot(metric='auc', color_beta=[5,5,3])
cv.plot(metric='r2q2', color_beta=[5,5,3])
Running ...
1/2: 0%| | 0/110 [00:00<?, ?it/s]Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. 1/2: 100%|████████████████████████████████████████████████████████████████████████████| 110/110 [01:44<00:00, 1.05it/s] 2/2: 100%|██████████████████████████████████████████████████████████████████████████| 1100/1100 [00:43<00:00, 25.17it/s]
Time taken: 2.54 minutes with 30 cores Done!
/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/utils/color_scale.py:9: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. x_init = scaler.fit_transform(x[:, np.newaxis]).flatten() /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/utils/color_scale.py:9: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. x_init = scaler.fit_transform(x[:, np.newaxis]).flatten() /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/utils/color_scale.py:9: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. x_init = scaler.fit_transform(x[:, np.newaxis]).flatten()
/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/utils/color_scale.py:9: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. x_init = scaler.fit_transform(x[:, np.newaxis]).flatten() /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/utils/color_scale.py:9: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. x_init = scaler.fit_transform(x[:, np.newaxis]).flatten() /home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/utils/color_scale.py:9: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version. Convert to a numpy array before indexing instead. x_init = scaler.fit_transform(x[:, np.newaxis]).flatten()
# Final RF for the 50%-missing data (same hyperparameters as the 10% run).
model = cb.model.RF(max_depth=7,
                    min_samples_leaf=0.05,
                    max_features='sqrt',
                    criterion='gini',
                    min_samples_split=2,
                    max_leaf_nodes=None,
                    n_estimators=100)
YPredTrain = model.train(XTrainKnn, YTrain)
YPredTest = model.test(XTestKnn)
# Pair true labels with predictions for cimcb's evaluate() helpers.
EvalTrain = [YTrain, YPredTrain]
EvalTest = [YTest, YPredTest]
model.evaluate(testset=EvalTest)
# Bootstrap evaluation: refit on all class-0/1 samples, autoscaled on the
# full data. NOTE(review): this train() overwrites the split fit that
# produced EvalTrain above — confirm against cimcb's intended workflow.
XBoot = DataTable2[PeakList]
XBootLog = np.log(XBoot)
XBootScale = cb.utils.scale(XBootLog, method='auto')
XBootKnn = XBootScale #cb.utils.knnimpute(XBootScale, k=3)
YPredBoot = model.train(XBootKnn, Y)
# 100 bootstrap resamples (percentile method).
bootmodel = cb.bootstrap.Per(model, bootnum=100)
bootmodel.run()
bootmodel.evaluate(trainset=EvalTrain, testset=EvalTest)
# Rank peaks by impurity-based importance, top 5. (abs() should be a no-op:
# sklearn impurity importances are non-negative.)
important_feat = model.model.feature_importances_
important_feat = abs(important_feat)
idx = important_feat.argsort(kind= "quicksort")  # ascending
idx= idx[::-1][:5]  # top 5
# NOTE(review): iloc[top_met.index-1] assumes PeakList's index equals
# PeakTable's position + 1 (Idx starts at 1, rows unshuffled) — verify.
top_met = PeakList.iloc[idx]
top_met_info = PeakTable.iloc[top_met.index-1]
top_met_info
| Idx | Name | Label | Perc_missing | |
|---|---|---|---|---|
| 86 | 86 | M86 | 1-stearoyl-2-arachidonoyl-GPI (18:0/20:4) | 0.060643 |
| 29 | 29 | M29 | 1-arachidonoyl-GPC (20:4n6)* | 0.000000 |
| 613 | 613 | M613 | linoleoyl-linoleoyl-glycerol (18:2/18:2) [1]* | 0.000000 |
| 221 | 221 | M221 | 4-guanidinobutanoate | 0.000000 |
| 77 | 77 | M77 | 1-palmitoyl-GPA (16:0) | 2.910855 |
Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend. Using Theano backend.
# Drop every row that still contains any missing value from the 10%-change
# dataset. (The commented-out line below is an earlier variant that dropped
# column 'M464' from change_50_DataSet instead.)
#irr_rem_change_10_DataSet= change_50_DataSet.drop(['M464'], axis=1)
irr_rem_change_10_DataSet= change_10_DataSet.dropna()
# Display the result — it has 0 rows: every sample has at least one NaN,
# so row-wise dropna removes the entire dataset.
irr_rem_change_10_DataSet
| Class | Hormone | M1 | M2 | M3 | M4 | M5 | M6 | M7 | M8 | ... | M940 | M941 | M942 | M943 | M944 | M945 | M946 | M947 | M948 | M949 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SampleID |
0 rows × 951 columns
# Display-only check: dropna() returns a new frame (not assigned) with rows
# containing any NaN removed — the result is 0 rows, confirming that every
# sample in miss_10_DataSet has at least one missing peak value.
miss_10_DataSet.dropna()
| Class | Hormone | M1 | M2 | M3 | M4 | M5 | M6 | M7 | M8 | ... | M940 | M941 | M942 | M943 | M944 | M945 | M946 | M947 | M948 | M949 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SampleID |
0 rows × 951 columns
#########################################
# Clean PeakTable and extract PeakList: keep peaks with < 20% missing values
PercMiss = PeakTable['Perc_missing']
PeakTableClean = PeakTable[(PercMiss < 20)]
PeakList = PeakTableClean['Name']
# Select subset of data: restrict to the two classes of the binary task
DataTable2 = miss_10_DataSet[(miss_10_DataSet.Class == 1) | (miss_10_DataSet.Class == 0)]
# Create a binary Y vector
Outcomes = DataTable2['Class']
Y = Outcomes.values
# Split data into train (2/3) and test (1/3), stratified on class
DataTrain, DataTest, YTrain, YTest = train_test_split(DataTable2, Y, test_size=1/3, stratify=Y, random_state=85)
# Extract train data: log-transform, autoscale, then KNN-impute missing values.
# FIX: knnimpute was previously commented out (XTrainKnn = XTrainScale), so X
# still contained NaNs and cv.run() crashed inside sklearn's SVM.fit with
# "Input contains NaN, infinity or a value too large for dtype('float64')".
XTrain = DataTrain[PeakList]
XTrainLog = np.log(XTrain)
XTrainScale, mu, sigma = cb.utils.scale(XTrainLog, method='auto', return_mu_sigma=True)
XTrainKnn = cb.utils.knnimpute(XTrainScale, k=3)
# Extract test data: reuse the train mu/sigma so the test set is scaled
# consistently with the train set, then impute.
XTest = DataTest[PeakList]
XTestLog = np.log(XTest)
XTestScale = cb.utils.scale(XTestLog, method='auto', mu=mu, sigma=sigma)
XTestKnn = cb.utils.knnimpute(XTestScale, k=3)
# Parameter dictionary: grid of SVM regularisation strengths, linear kernel
C_range = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
param_dict = dict(C=C_range, kernel="linear")
# Initialise 5-fold cross-validation with 10 Monte Carlo repetitions
cv = cb.cross_val.KFold(model=cb.model.SVM,
                        X=XTrainKnn,
                        Y=YTrain,
                        param_dict=param_dict,
                        folds=5,
                        n_mc=10,
                        n_cores=30)
# Run and plot
cv.run()
cv.plot(metric='auc')
cv.plot(metric='r2q2')
Running ...
1/2: 100%|███████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 817.40it/s]
TerminatedWorkerError was raised due to excessive memory usage. n_cores was reduced to 1.
1/2: 0%| | 0/7 [00:00<?, ?it/s]
--------------------------------------------------------------------------- _RemoteTraceback Traceback (most recent call last) _RemoteTraceback: """ Traceback (most recent call last): File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 428, in _process_worker r = call_item() File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 275, in __call__ return self.fn(*self.args, **self.kwargs) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 620, in __call__ return self.func(*args, **kwargs) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py", line 289, in __call__ for func, args, kwargs in self.items] File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py", line 289, in <listcomp> for func, args, kwargs in self.items] File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py", line 113, in _calc_full_loop model_i.train(self.X, self.Y) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/model/SVM.py", line 48, in train self.model.fit(X, Y) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/svm/_base.py", line 196, in fit accept_large_sparse=False, File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/base.py", line 581, in _validate_data X, y = check_X_y(X, y, **check_params) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py", line 976, in check_X_y 
estimator=estimator, File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py", line 800, in check_array _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py", line 116, in _assert_all_finite type_err, msg_dtype if msg_dtype is not None else X.dtype ValueError: Input contains NaN, infinity or a value too large for dtype('float64'). """ The above exception was the direct cause of the following exception: ValueError Traceback (most recent call last) ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py in calc_ypred(self) 61 try: ---> 62 full = Parallel(n_jobs=self.n_cores)(delayed(self._calc_full_loop)(i) for i in tqdm(range(len(self.param_list)), desc="1/2")) 63 except: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable) 1097 with self._backend.retrieval_context(): -> 1098 self.retrieve() 1099 # Make sure that we get a last message telling us we are done ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self) 974 if getattr(self._backend, 'supports_timeout', False): --> 975 self._output.extend(job.get(timeout=self.timeout)) 976 else: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout) 566 try: --> 567 return future.result(timeout=timeout) 568 except CfTimeoutError as e: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/concurrent/futures/_base.py in result(self, timeout) 431 elif self._state == FINISHED: --> 432 return self.__get_result() 433 else: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/concurrent/futures/_base.py in 
__get_result(self) 383 if self._exception: --> 384 raise self._exception 385 else: ValueError: Input contains NaN, infinity or a value too large for dtype('float64'). During handling of the above exception, another exception occurred: ValueError Traceback (most recent call last) /tmp/ipykernel_519792/704266850.py in <module> 13 14 # Run and Plot ---> 15 cv.run() 16 cv.plot(metric='auc') 17 cv.plot(metric='r2q2') ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/BaseCrossVal.py in run(self) 100 print("returning stats at 'x' epoch interval during training until epoch={}.".format(epoch_list[-1])) 101 else: --> 102 self.calc_ypred() 103 self.calc_stats() 104 print("Done!") ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py in calc_ypred(self) 63 except: 64 print("TerminatedWorkerError was raised due to excessive memory usage. n_cores was reduced to 1.") ---> 65 full = Parallel(n_jobs=1)(delayed(self._calc_full_loop)(i) for i in tqdm(range(len(self.param_list)), desc="1/2")) 66 self.ypred_full = [] 67 self.x_scores_full = [] ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable) 1083 # remaining jobs. 
1084 self._iterating = False -> 1085 if self.dispatch_one_batch(iterator): 1086 self._iterating = self._original_iterator is not None 1087 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator) 899 return False 900 else: --> 901 self._dispatch(tasks) 902 return True 903 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch) 817 with self._lock: 818 job_idx = len(self._jobs) --> 819 job = self._backend.apply_async(batch, callback=cb) 820 # A job can complete so quickly than its callback is 821 # called before we get here, causing self._jobs to ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback) 206 def apply_async(self, func, callback=None): 207 """Schedule a func to be run""" --> 208 result = ImmediateResult(func) 209 if callback: 210 callback(result) ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch) 595 # Don't delay the application, to avoid keeping the input 596 # arguments in memory --> 597 self.results = batch() 598 599 def get(self): ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in __call__(self) 287 with parallel_backend(self._backend, n_jobs=self._n_jobs): 288 return [func(*args, **kwargs) --> 289 for func, args, kwargs in self.items] 290 291 def __reduce__(self): ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0) 287 with parallel_backend(self._backend, n_jobs=self._n_jobs): 288 return [func(*args, **kwargs) --> 289 for func, args, kwargs in self.items] 290 291 def __reduce__(self): ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py in 
_calc_full_loop(self, i) 111 if model_i.__name__ == "cimcb.model.NN_SigmoidSigmoid" or model_i.__name__ == "cimcb.model.NN_SigmoidSigmoid": 112 model_i.compiled = False --> 113 model_i.train(self.X, self.Y) 114 ypred_full_i = model_i.test(self.X) 115 ypred_full = ypred_full_i ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/model/SVM.py in train(self, X, Y) 46 47 # Fit the model ---> 48 self.model.fit(X, Y) 49 50 # Predict_proba was designed for multi-groups... ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/svm/_base.py in fit(self, X, y, sample_weight) 194 order="C", 195 accept_sparse="csr", --> 196 accept_large_sparse=False, 197 ) 198 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params) 579 y = check_array(y, **check_y_params) 580 else: --> 581 X, y = check_X_y(X, y, **check_params) 582 out = X, y 583 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator) 974 ensure_min_samples=ensure_min_samples, 975 ensure_min_features=ensure_min_features, --> 976 estimator=estimator, 977 ) 978 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator) 798 799 if force_all_finite: --> 800 _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") 801 802 if ensure_min_samples > 0: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in 
_assert_all_finite(X, allow_nan, msg_dtype) 114 raise ValueError( 115 msg_err.format( --> 116 type_err, msg_dtype if msg_dtype is not None else X.dtype 117 ) 118 ) ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
# Clean PeakTable and extract PeakList: keep peaks with < 20% missing values
PercMiss = PeakTable['Perc_missing']
PeakTableClean = PeakTable[(PercMiss < 20)]
PeakList = PeakTableClean['Name']
# Select subset of data: restrict to the two classes of the binary task
DataTable2 = miss_50_DataSet[(miss_50_DataSet.Class == 1) | (miss_50_DataSet.Class == 0)]
# Create a binary Y vector
Outcomes = DataTable2['Class']
Y = Outcomes.values
# Split data into train (2/3) and test (1/3), stratified on class
DataTrain, DataTest, YTrain, YTest = train_test_split(DataTable2, Y, test_size=1/3, stratify=Y, random_state=85)
# Extract train data: log-transform, autoscale, then KNN-impute missing values.
# FIX: knnimpute was previously commented out (XTrainKnn = XTrainScale), so X
# still contained NaNs and cv.run() crashed inside sklearn's SVM.fit with
# "Input contains NaN, infinity or a value too large for dtype('float64')".
XTrain = DataTrain[PeakList]
XTrainLog = np.log(XTrain)
XTrainScale, mu, sigma = cb.utils.scale(XTrainLog, method='auto', return_mu_sigma=True)
XTrainKnn = cb.utils.knnimpute(XTrainScale, k=3)
# Extract test data: reuse the train mu/sigma so the test set is scaled
# consistently with the train set, then impute.
XTest = DataTest[PeakList]
XTestLog = np.log(XTest)
XTestScale = cb.utils.scale(XTestLog, method='auto', mu=mu, sigma=sigma)
XTestKnn = cb.utils.knnimpute(XTestScale, k=3)
# Parameter dictionary: grid of SVM regularisation strengths, linear kernel
C_range = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
param_dict = dict(C=C_range, kernel="linear")
# Initialise 5-fold cross-validation with 10 Monte Carlo repetitions
cv = cb.cross_val.KFold(model=cb.model.SVM,
                        X=XTrainKnn,
                        Y=YTrain,
                        param_dict=param_dict,
                        folds=5,
                        n_mc=10,
                        n_cores=30)
# Run and plot
cv.run()
cv.plot(metric='auc')
cv.plot(metric='r2q2')
Running ...
1/2: 100%|██████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 3062.17it/s]
TerminatedWorkerError was raised due to excessive memory usage. n_cores was reduced to 1.
1/2: 0%| | 0/7 [00:00<?, ?it/s]
--------------------------------------------------------------------------- _RemoteTraceback Traceback (most recent call last) _RemoteTraceback: """ Traceback (most recent call last): File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 428, in _process_worker r = call_item() File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 275, in __call__ return self.fn(*self.args, **self.kwargs) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 620, in __call__ return self.func(*args, **kwargs) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py", line 289, in __call__ for func, args, kwargs in self.items] File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py", line 289, in <listcomp> for func, args, kwargs in self.items] File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py", line 113, in _calc_full_loop model_i.train(self.X, self.Y) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/model/SVM.py", line 48, in train self.model.fit(X, Y) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/svm/_base.py", line 196, in fit accept_large_sparse=False, File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/base.py", line 581, in _validate_data X, y = check_X_y(X, y, **check_params) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py", line 976, in check_X_y 
estimator=estimator, File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py", line 800, in check_array _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py", line 116, in _assert_all_finite type_err, msg_dtype if msg_dtype is not None else X.dtype ValueError: Input contains NaN, infinity or a value too large for dtype('float64'). """ The above exception was the direct cause of the following exception: ValueError Traceback (most recent call last) ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py in calc_ypred(self) 61 try: ---> 62 full = Parallel(n_jobs=self.n_cores)(delayed(self._calc_full_loop)(i) for i in tqdm(range(len(self.param_list)), desc="1/2")) 63 except: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable) 1097 with self._backend.retrieval_context(): -> 1098 self.retrieve() 1099 # Make sure that we get a last message telling us we are done ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self) 974 if getattr(self._backend, 'supports_timeout', False): --> 975 self._output.extend(job.get(timeout=self.timeout)) 976 else: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout) 566 try: --> 567 return future.result(timeout=timeout) 568 except CfTimeoutError as e: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/concurrent/futures/_base.py in result(self, timeout) 431 elif self._state == FINISHED: --> 432 return self.__get_result() 433 else: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/concurrent/futures/_base.py in 
__get_result(self) 383 if self._exception: --> 384 raise self._exception 385 else: ValueError: Input contains NaN, infinity or a value too large for dtype('float64'). During handling of the above exception, another exception occurred: ValueError Traceback (most recent call last) /tmp/ipykernel_519792/704266850.py in <module> 13 14 # Run and Plot ---> 15 cv.run() 16 cv.plot(metric='auc') 17 cv.plot(metric='r2q2') ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/BaseCrossVal.py in run(self) 100 print("returning stats at 'x' epoch interval during training until epoch={}.".format(epoch_list[-1])) 101 else: --> 102 self.calc_ypred() 103 self.calc_stats() 104 print("Done!") ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py in calc_ypred(self) 63 except: 64 print("TerminatedWorkerError was raised due to excessive memory usage. n_cores was reduced to 1.") ---> 65 full = Parallel(n_jobs=1)(delayed(self._calc_full_loop)(i) for i in tqdm(range(len(self.param_list)), desc="1/2")) 66 self.ypred_full = [] 67 self.x_scores_full = [] ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable) 1083 # remaining jobs. 
1084 self._iterating = False -> 1085 if self.dispatch_one_batch(iterator): 1086 self._iterating = self._original_iterator is not None 1087 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator) 899 return False 900 else: --> 901 self._dispatch(tasks) 902 return True 903 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch) 817 with self._lock: 818 job_idx = len(self._jobs) --> 819 job = self._backend.apply_async(batch, callback=cb) 820 # A job can complete so quickly than its callback is 821 # called before we get here, causing self._jobs to ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback) 206 def apply_async(self, func, callback=None): 207 """Schedule a func to be run""" --> 208 result = ImmediateResult(func) 209 if callback: 210 callback(result) ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch) 595 # Don't delay the application, to avoid keeping the input 596 # arguments in memory --> 597 self.results = batch() 598 599 def get(self): ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in __call__(self) 287 with parallel_backend(self._backend, n_jobs=self._n_jobs): 288 return [func(*args, **kwargs) --> 289 for func, args, kwargs in self.items] 290 291 def __reduce__(self): ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0) 287 with parallel_backend(self._backend, n_jobs=self._n_jobs): 288 return [func(*args, **kwargs) --> 289 for func, args, kwargs in self.items] 290 291 def __reduce__(self): ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py in 
_calc_full_loop(self, i) 111 if model_i.__name__ == "cimcb.model.NN_SigmoidSigmoid" or model_i.__name__ == "cimcb.model.NN_SigmoidSigmoid": 112 model_i.compiled = False --> 113 model_i.train(self.X, self.Y) 114 ypred_full_i = model_i.test(self.X) 115 ypred_full = ypred_full_i ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/model/SVM.py in train(self, X, Y) 46 47 # Fit the model ---> 48 self.model.fit(X, Y) 49 50 # Predict_proba was designed for multi-groups... ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/svm/_base.py in fit(self, X, y, sample_weight) 194 order="C", 195 accept_sparse="csr", --> 196 accept_large_sparse=False, 197 ) 198 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params) 579 y = check_array(y, **check_y_params) 580 else: --> 581 X, y = check_X_y(X, y, **check_params) 582 out = X, y 583 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator) 974 ensure_min_samples=ensure_min_samples, 975 ensure_min_features=ensure_min_features, --> 976 estimator=estimator, 977 ) 978 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator) 798 799 if force_all_finite: --> 800 _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") 801 802 if ensure_min_samples > 0: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in 
_assert_all_finite(X, allow_nan, msg_dtype) 114 raise ValueError( 115 msg_err.format( --> 116 type_err, msg_dtype if msg_dtype is not None else X.dtype 117 ) 118 ) ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
# Clean PeakTable and extract PeakList: keep peaks with < 20% missing values
PercMiss = PeakTable['Perc_missing']
PeakTableClean = PeakTable[(PercMiss < 20)]
PeakList = PeakTableClean['Name']
# Select subset of data: restrict to the two classes of the binary task
DataTable2 = change_10_DataSet[(change_10_DataSet.Class == 1) | (change_10_DataSet.Class == 0)]
# Create a binary Y vector
Outcomes = DataTable2['Class']
Y = Outcomes.values
# Split data into train (2/3) and test (1/3), stratified on class
DataTrain, DataTest, YTrain, YTest = train_test_split(DataTable2, Y, test_size=1/3, stratify=Y, random_state=85)
# Extract train data: log-transform, autoscale, then KNN-impute missing values.
# FIX: knnimpute was previously commented out (XTrainKnn = XTrainScale), so X
# still contained NaNs and cv.run() crashed inside sklearn's SVM.fit with
# "Input contains NaN, infinity or a value too large for dtype('float64')".
XTrain = DataTrain[PeakList]
XTrainLog = np.log(XTrain)
XTrainScale, mu, sigma = cb.utils.scale(XTrainLog, method='auto', return_mu_sigma=True)
XTrainKnn = cb.utils.knnimpute(XTrainScale, k=3)
# Extract test data: reuse the train mu/sigma so the test set is scaled
# consistently with the train set, then impute.
XTest = DataTest[PeakList]
XTestLog = np.log(XTest)
XTestScale = cb.utils.scale(XTestLog, method='auto', mu=mu, sigma=sigma)
XTestKnn = cb.utils.knnimpute(XTestScale, k=3)
# Parameter dictionary: grid of SVM regularisation strengths, linear kernel
C_range = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
param_dict = dict(C=C_range, kernel="linear")
# Initialise 5-fold cross-validation with 10 Monte Carlo repetitions
cv = cb.cross_val.KFold(model=cb.model.SVM,
                        X=XTrainKnn,
                        Y=YTrain,
                        param_dict=param_dict,
                        folds=5,
                        n_mc=10,
                        n_cores=30)
# Run and plot
cv.run()
cv.plot(metric='auc')
cv.plot(metric='r2q2')
Running ...
1/2: 100%|██████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 3022.46it/s]
TerminatedWorkerError was raised due to excessive memory usage. n_cores was reduced to 1.
1/2: 0%| | 0/7 [00:00<?, ?it/s]
--------------------------------------------------------------------------- _RemoteTraceback Traceback (most recent call last) _RemoteTraceback: """ Traceback (most recent call last): File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 428, in _process_worker r = call_item() File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 275, in __call__ return self.fn(*self.args, **self.kwargs) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 620, in __call__ return self.func(*args, **kwargs) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py", line 289, in __call__ for func, args, kwargs in self.items] File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py", line 289, in <listcomp> for func, args, kwargs in self.items] File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py", line 113, in _calc_full_loop model_i.train(self.X, self.Y) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/model/SVM.py", line 48, in train self.model.fit(X, Y) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/svm/_base.py", line 196, in fit accept_large_sparse=False, File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/base.py", line 581, in _validate_data X, y = check_X_y(X, y, **check_params) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py", line 976, in check_X_y 
estimator=estimator, File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py", line 800, in check_array _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py", line 116, in _assert_all_finite type_err, msg_dtype if msg_dtype is not None else X.dtype ValueError: Input contains NaN, infinity or a value too large for dtype('float64'). """ The above exception was the direct cause of the following exception: ValueError Traceback (most recent call last) ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py in calc_ypred(self) 61 try: ---> 62 full = Parallel(n_jobs=self.n_cores)(delayed(self._calc_full_loop)(i) for i in tqdm(range(len(self.param_list)), desc="1/2")) 63 except: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable) 1097 with self._backend.retrieval_context(): -> 1098 self.retrieve() 1099 # Make sure that we get a last message telling us we are done ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self) 974 if getattr(self._backend, 'supports_timeout', False): --> 975 self._output.extend(job.get(timeout=self.timeout)) 976 else: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout) 566 try: --> 567 return future.result(timeout=timeout) 568 except CfTimeoutError as e: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/concurrent/futures/_base.py in result(self, timeout) 431 elif self._state == FINISHED: --> 432 return self.__get_result() 433 else: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/concurrent/futures/_base.py in 
__get_result(self) 383 if self._exception: --> 384 raise self._exception 385 else: ValueError: Input contains NaN, infinity or a value too large for dtype('float64'). During handling of the above exception, another exception occurred: ValueError Traceback (most recent call last) /tmp/ipykernel_519792/704266850.py in <module> 13 14 # Run and Plot ---> 15 cv.run() 16 cv.plot(metric='auc') 17 cv.plot(metric='r2q2') ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/BaseCrossVal.py in run(self) 100 print("returning stats at 'x' epoch interval during training until epoch={}.".format(epoch_list[-1])) 101 else: --> 102 self.calc_ypred() 103 self.calc_stats() 104 print("Done!") ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py in calc_ypred(self) 63 except: 64 print("TerminatedWorkerError was raised due to excessive memory usage. n_cores was reduced to 1.") ---> 65 full = Parallel(n_jobs=1)(delayed(self._calc_full_loop)(i) for i in tqdm(range(len(self.param_list)), desc="1/2")) 66 self.ypred_full = [] 67 self.x_scores_full = [] ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable) 1083 # remaining jobs. 
1084 self._iterating = False -> 1085 if self.dispatch_one_batch(iterator): 1086 self._iterating = self._original_iterator is not None 1087 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator) 899 return False 900 else: --> 901 self._dispatch(tasks) 902 return True 903 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch) 817 with self._lock: 818 job_idx = len(self._jobs) --> 819 job = self._backend.apply_async(batch, callback=cb) 820 # A job can complete so quickly than its callback is 821 # called before we get here, causing self._jobs to ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback) 206 def apply_async(self, func, callback=None): 207 """Schedule a func to be run""" --> 208 result = ImmediateResult(func) 209 if callback: 210 callback(result) ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch) 595 # Don't delay the application, to avoid keeping the input 596 # arguments in memory --> 597 self.results = batch() 598 599 def get(self): ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in __call__(self) 287 with parallel_backend(self._backend, n_jobs=self._n_jobs): 288 return [func(*args, **kwargs) --> 289 for func, args, kwargs in self.items] 290 291 def __reduce__(self): ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0) 287 with parallel_backend(self._backend, n_jobs=self._n_jobs): 288 return [func(*args, **kwargs) --> 289 for func, args, kwargs in self.items] 290 291 def __reduce__(self): ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py in 
_calc_full_loop(self, i) 111 if model_i.__name__ == "cimcb.model.NN_SigmoidSigmoid" or model_i.__name__ == "cimcb.model.NN_SigmoidSigmoid": 112 model_i.compiled = False --> 113 model_i.train(self.X, self.Y) 114 ypred_full_i = model_i.test(self.X) 115 ypred_full = ypred_full_i ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/model/SVM.py in train(self, X, Y) 46 47 # Fit the model ---> 48 self.model.fit(X, Y) 49 50 # Predict_proba was designed for multi-groups... ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/svm/_base.py in fit(self, X, y, sample_weight) 194 order="C", 195 accept_sparse="csr", --> 196 accept_large_sparse=False, 197 ) 198 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params) 579 y = check_array(y, **check_y_params) 580 else: --> 581 X, y = check_X_y(X, y, **check_params) 582 out = X, y 583 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator) 974 ensure_min_samples=ensure_min_samples, 975 ensure_min_features=ensure_min_features, --> 976 estimator=estimator, 977 ) 978 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator) 798 799 if force_all_finite: --> 800 _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") 801 802 if ensure_min_samples > 0: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in 
_assert_all_finite(X, allow_nan, msg_dtype) 114 raise ValueError( 115 msg_err.format( --> 116 type_err, msg_dtype if msg_dtype is not None else X.dtype 117 ) 118 ) ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
# Keep only peaks with fewer than 20% missing values, then extract their names
missing_pct = PeakTable['Perc_missing']
PeakTableClean = PeakTable[missing_pct < 20]
PeakList = PeakTableClean['Name']
# Select Subset of Data: keep only the two classes of interest (0 and 1)
# NOTE(review): `change_50_DataSet` is not defined in this chunk — presumably a
# transformed copy of DataTable created in an earlier cell; confirm it exists
# before running, otherwise this raises NameError.
DataTable2 = change_50_DataSet[(change_50_DataSet.Class == 1) | (change_50_DataSet.Class == 0)]
# Create a Binary Y Vector from the Class column
Outcomes = DataTable2['Class']
Y = Outcomes.values
# Split Data into Train (2/3) and Test (1/3); stratify keeps the class
# proportions identical in both partitions, random_state makes it reproducible
DataTrain, DataTest, YTrain, YTest = train_test_split(DataTable2, Y, test_size=1/3, stratify=Y, random_state=85)
# Extract Train Data: log-transform, autoscale, then impute missing values
XTrain = DataTrain[PeakList]
XTrainLog = np.log(XTrain)
XTrainScale, mu, sigma = cb.utils.scale(XTrainLog, method='auto', return_mu_sigma=True)
# k-nearest-neighbour imputation fills remaining NaNs. Skipping this step
# (it was previously commented out) leaves NaNs in the matrix and makes
# sklearn's SVM fit raise "ValueError: Input contains NaN ...".
XTrainKnn = cb.utils.knnimpute(XTrainScale, k=3)
# Extract Test Data: same pipeline, reusing the training mu/sigma so the
# test set is scaled on the training basis (no information leakage)
XTest = DataTest[PeakList]
XTestLog = np.log(XTest)
XTestScale = cb.utils.scale(XTestLog, method='auto', mu=mu, sigma=sigma)
XTestKnn = cb.utils.knnimpute(XTestScale, k=3)
# Parameter Dictionary: grid of SVM regularisation strengths to cross-validate
C_range = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1]
param_dict = dict(C=C_range, kernel="linear")
# Initialise 5-fold cross-validation with 10 Monte Carlo repetitions.
# n_cores=30 previously raised a TerminatedWorkerError from excessive memory
# usage (cimcb then falls back to a single core); a smaller worker count
# avoids the crash while keeping some parallelism.
cv = cb.cross_val.KFold(model=cb.model.SVM,
                        X=XTrainKnn,
                        Y=YTrain,
                        param_dict=param_dict,
                        folds=5,
                        n_mc=10,
                        n_cores=4)
# Run the grid search and plot performance vs. C
cv.run()
cv.plot(metric='auc')
cv.plot(metric='r2q2')
Running ...
1/2: 100%|██████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 1210.08it/s]
TerminatedWorkerError was raised due to excessive memory usage. n_cores was reduced to 1.
1/2: 0%| | 0/7 [00:00<?, ?it/s]
--------------------------------------------------------------------------- _RemoteTraceback Traceback (most recent call last) _RemoteTraceback: """ Traceback (most recent call last): File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 428, in _process_worker r = call_item() File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 275, in __call__ return self.fn(*self.args, **self.kwargs) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 620, in __call__ return self.func(*args, **kwargs) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py", line 289, in __call__ for func, args, kwargs in self.items] File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py", line 289, in <listcomp> for func, args, kwargs in self.items] File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py", line 113, in _calc_full_loop model_i.train(self.X, self.Y) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/model/SVM.py", line 48, in train self.model.fit(X, Y) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/svm/_base.py", line 196, in fit accept_large_sparse=False, File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/base.py", line 581, in _validate_data X, y = check_X_y(X, y, **check_params) File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py", line 976, in check_X_y 
estimator=estimator, File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py", line 800, in check_array _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") File "/home/anepal/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py", line 116, in _assert_all_finite type_err, msg_dtype if msg_dtype is not None else X.dtype ValueError: Input contains NaN, infinity or a value too large for dtype('float64'). """ The above exception was the direct cause of the following exception: ValueError Traceback (most recent call last) ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py in calc_ypred(self) 61 try: ---> 62 full = Parallel(n_jobs=self.n_cores)(delayed(self._calc_full_loop)(i) for i in tqdm(range(len(self.param_list)), desc="1/2")) 63 except: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable) 1097 with self._backend.retrieval_context(): -> 1098 self.retrieve() 1099 # Make sure that we get a last message telling us we are done ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self) 974 if getattr(self._backend, 'supports_timeout', False): --> 975 self._output.extend(job.get(timeout=self.timeout)) 976 else: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout) 566 try: --> 567 return future.result(timeout=timeout) 568 except CfTimeoutError as e: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/concurrent/futures/_base.py in result(self, timeout) 431 elif self._state == FINISHED: --> 432 return self.__get_result() 433 else: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/concurrent/futures/_base.py in 
__get_result(self) 383 if self._exception: --> 384 raise self._exception 385 else: ValueError: Input contains NaN, infinity or a value too large for dtype('float64'). During handling of the above exception, another exception occurred: ValueError Traceback (most recent call last) /tmp/ipykernel_519792/704266850.py in <module> 13 14 # Run and Plot ---> 15 cv.run() 16 cv.plot(metric='auc') 17 cv.plot(metric='r2q2') ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/BaseCrossVal.py in run(self) 100 print("returning stats at 'x' epoch interval during training until epoch={}.".format(epoch_list[-1])) 101 else: --> 102 self.calc_ypred() 103 self.calc_stats() 104 print("Done!") ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py in calc_ypred(self) 63 except: 64 print("TerminatedWorkerError was raised due to excessive memory usage. n_cores was reduced to 1.") ---> 65 full = Parallel(n_jobs=1)(delayed(self._calc_full_loop)(i) for i in tqdm(range(len(self.param_list)), desc="1/2")) 66 self.ypred_full = [] 67 self.x_scores_full = [] ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable) 1083 # remaining jobs. 
1084 self._iterating = False -> 1085 if self.dispatch_one_batch(iterator): 1086 self._iterating = self._original_iterator is not None 1087 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator) 899 return False 900 else: --> 901 self._dispatch(tasks) 902 return True 903 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch) 817 with self._lock: 818 job_idx = len(self._jobs) --> 819 job = self._backend.apply_async(batch, callback=cb) 820 # A job can complete so quickly than its callback is 821 # called before we get here, causing self._jobs to ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback) 206 def apply_async(self, func, callback=None): 207 """Schedule a func to be run""" --> 208 result = ImmediateResult(func) 209 if callback: 210 callback(result) ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch) 595 # Don't delay the application, to avoid keeping the input 596 # arguments in memory --> 597 self.results = batch() 598 599 def get(self): ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in __call__(self) 287 with parallel_backend(self._backend, n_jobs=self._n_jobs): 288 return [func(*args, **kwargs) --> 289 for func, args, kwargs in self.items] 290 291 def __reduce__(self): ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0) 287 with parallel_backend(self._backend, n_jobs=self._n_jobs): 288 return [func(*args, **kwargs) --> 289 for func, args, kwargs in self.items] 290 291 def __reduce__(self): ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/cross_val/KFold.py in 
_calc_full_loop(self, i) 111 if model_i.__name__ == "cimcb.model.NN_SigmoidSigmoid" or model_i.__name__ == "cimcb.model.NN_SigmoidSigmoid": 112 model_i.compiled = False --> 113 model_i.train(self.X, self.Y) 114 ypred_full_i = model_i.test(self.X) 115 ypred_full = ypred_full_i ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/model/SVM.py in train(self, X, Y) 46 47 # Fit the model ---> 48 self.model.fit(X, Y) 49 50 # Predict_proba was designed for multi-groups... ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/svm/_base.py in fit(self, X, y, sample_weight) 194 order="C", 195 accept_sparse="csr", --> 196 accept_large_sparse=False, 197 ) 198 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params) 579 y = check_array(y, **check_y_params) 580 else: --> 581 X, y = check_X_y(X, y, **check_params) 582 out = X, y 583 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator) 974 ensure_min_samples=ensure_min_samples, 975 ensure_min_features=ensure_min_features, --> 976 estimator=estimator, 977 ) 978 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator) 798 799 if force_all_finite: --> 800 _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") 801 802 if ensure_min_samples > 0: ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in 
_assert_all_finite(X, allow_nan, msg_dtype) 114 raise ValueError( 115 msg_err.format( --> 116 type_err, msg_dtype if msg_dtype is not None else X.dtype 117 ) 118 ) ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
# Train the final linear SVM using the C value selected from cross-validation
model = cb.model.SVM(C=0.0005, kernel="linear")
YPredTrain = model.train(XTrainKnn, YTrain)
YPredTest = model.test(XTestKnn)
# Pair each ground-truth vector with its predicted values for evaluation
EvalTrain = [YTrain, YPredTrain]
EvalTest = [YTest, YPredTest]
# Evaluate the model, including the held-out test dataset
model.evaluate(testset=EvalTest)
1/2: 0%| | 0/7 [02:03<?, ?it/s] 1/2: 0%| | 0/7 [01:24<?, ?it/s]
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) /tmp/ipykernel_519792/3108593830.py in <module> 1 # Build Model 2 model = cb.model.SVM(C=0.0005, kernel="linear") ----> 3 YPredTrain = model.train(XTrainKnn, YTrain) 4 YPredTest = model.test(XTestKnn) 5 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/cimcb/model/SVM.py in train(self, X, Y) 46 47 # Fit the model ---> 48 self.model.fit(X, Y) 49 50 # Predict_proba was designed for multi-groups... ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/svm/_base.py in fit(self, X, y, sample_weight) 194 order="C", 195 accept_sparse="csr", --> 196 accept_large_sparse=False, 197 ) 198 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/base.py in _validate_data(self, X, y, reset, validate_separately, **check_params) 579 y = check_array(y, **check_y_params) 580 else: --> 581 X, y = check_X_y(X, y, **check_params) 582 out = X, y 583 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator) 974 ensure_min_samples=ensure_min_samples, 975 ensure_min_features=ensure_min_features, --> 976 estimator=estimator, 977 ) 978 ~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator) 798 799 if force_all_finite: --> 800 _assert_all_finite(array, allow_nan=force_all_finite == "allow-nan") 801 802 if ensure_min_samples > 0: 
~/.conda/envs/process_env/envs/MetabComparisonBinaryML/lib/python3.7/site-packages/sklearn/utils/validation.py in _assert_all_finite(X, allow_nan, msg_dtype) 114 raise ValueError( 115 msg_err.format( --> 116 type_err, msg_dtype if msg_dtype is not None else X.dtype 117 ) 118 ) ValueError: Input contains NaN, infinity or a value too large for dtype('float64').